Show the code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Show the code
library(skimr)
library(survival)
library(survminer)
Loading required package: ggpubr
Attaching package: 'survminer'
The following object is masked from 'package:survival':
myeloma
Show the code
library(fitdistrplus)
Loading required package: MASS
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
Show the code
# Directory that holds the input CSV files for this analysis.
thePath <- "/Users/Shared/Survival Analysis"

# Click-stream records: one row per viewing event. The file's first column has
# no header, so readr renames it to `...1` on import.
df <- read_csv(file.path(thePath, "vodclickstream_uk_movies_03.csv"))
New names:
Rows: 671736 Columns: 8
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(5): title, genres, release_date, movie_id, user_id dbl (2): ...1, duration
dttm (1): datetime
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
Show the code
# Netflix title catalogue; read from the same directory as the click-stream
# file. It supplies the per-title `duration` used later as duration.y.
df2 <- read_csv(file.path(thePath, "netflix_titles.csv"))
Rows: 8807 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): show_id, type, title, director, cast, country, date_added, rating,...
dbl (1): release_year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Show the code
# Merging data that contains movie length to normalize watch length.
# Both inputs have a `duration` column, so after the join the click-stream
# value is `duration.x` and the catalogue value is `duration.y`.
# NOTE(review): the join key is the title text alone; titles that appear more
# than once in either table will fan out rows - confirm this is acceptable.
df <- merge(df, df2, by = "title")
# Data cleaning and preparation

# Removing series: match "Season" rather than "Seasons" so that single-season
# entries ("1 Season") are dropped too. Left in, they cannot be parsed as
# minutes below and become NA with a coercion warning.
df <- subset(df, !grepl("Season", duration.y))

# Converting duration to numeric: strip the " min" suffix from the catalogue
# runtime and parse what is left as a number of minutes.
df$duration.y <- as.numeric(gsub(" min", "", df$duration.y))
Warning: NAs introduced by coercion
Show the code
# Removing invalid durations: keep only rows with a strictly positive watch
# duration. subset() also drops rows where duration.x is NA.
df <- subset(df, duration.x > 0)
# Creating columns for analysis
#
# Adds the event indicator, the watch-time measures and one 0/1 flag per genre.
# Genre flags use case-sensitive substring matching against `genres`, so every
# pattern must be spelled exactly as the genre appears in the data
# (capitalised, e.g. "Romance", "Animation", "Musical").
df <- df %>%
  mutate(
    # Event indicator: 1 when duration.x > 0. If rows were already filtered to
    # duration.x > 0 upstream, this is 1 for every row.
    event = ifelse(duration.x > 0, 1, 0),
    genres = as.factor(genres),
    # assumes duration.x is recorded in seconds - TODO confirm
    minutes_watched = duration.x / 60,
    # Share of the catalogue runtime (duration.y, minutes) that was watched;
    # can exceed 1 and is NA when duration.y is NA.
    perc_movie_watched = minutes_watched / duration.y,
    is_action = as.numeric(grepl("Action", genres)),
    is_adventure = as.numeric(grepl("Adventure", genres)),
    is_comedy = as.numeric(grepl("Comedy", genres)),
    is_documentary = as.numeric(grepl("Documentary", genres)),
    is_drama = as.numeric(grepl("Drama", genres)),
    is_horror = as.numeric(grepl("Horror", genres)),
    is_thriller = as.numeric(grepl("Thriller", genres)),
    is_romance = as.numeric(grepl("Romance", genres)),
    is_animation = as.numeric(grepl("Animation", genres)),
    is_crime = as.numeric(grepl("Crime", genres)),
    is_scifi = as.numeric(grepl("Sci-Fi", genres)),
    is_sport = as.numeric(grepl("Sport", genres)),
    is_musical = as.numeric(grepl("Musical", genres)),
    is_fantasy = as.numeric(grepl("Fantasy", genres)),
    is_mystery = as.numeric(grepl("Mystery", genres)),
    is_biography = as.numeric(grepl("Biography", genres)),
    is_history = as.numeric(grepl("History", genres)),
    is_war = as.numeric(grepl("War", genres)),
    is_western = as.numeric(grepl("Western", genres)),
    is_short = as.numeric(grepl("Short", genres))
  )
# Cleaning up the percentage of the movie watched: cap the ratio at 1 (values
# above 1 mean more watch time than the catalogue runtime) and round to two
# decimals. NA ratios stay NA.
df$perc_movie_watched_clean <- round(pmin(df$perc_movie_watched, 1), 2)